#include "chromosomeNTdata.h"

// Size of the stack line buffers used by the line-by-line FASTA readers in this file
// (readFastaFileLineByLine and getsizeofChromosome).
const int MAX_CHAR_PER_LINE = 10000;
// Default constructor: puts the object into the empty state set up by
// initialization() (no sequence buffer, size 0, window at 0, end flag cleared).
CchromosomeNTdata::CchromosomeNTdata(void)
{
    initialization();
}

// Construct from a sequence file.
// Files whose name carries one of the extensions .fasta / .fna / .fa / .txt are
// loaded through Constructor_Fasta(); every other file is loaded through
// Consrructor_PreSeq(). After loading, the whole buffer is passed to toUpperCase().
CchromosomeNTdata::CchromosomeNTdata(const char* Filename)
{
    this->initialization();
    LOG_INFO("Info %d: Enter %s\n", FINE_LOG, Filename);

    // The || chain short-circuits in the same order as the extension list above.
    const bool isFastaLikeName = hasTheExtName(Filename, ".fasta")
                                 || hasTheExtName(Filename, ".fna")
                                 || hasTheExtName(Filename, ".fa")
                                 || hasTheExtName(Filename, ".txt");
    if (isFastaLikeName) {
        this->Constructor_Fasta(Filename);
    } else {
        this->Consrructor_PreSeq(Filename);
    }

    // Normalize the loaded sequence to upper case.
    toUpperCase(this->caChromosome, this->iChromosome_size);
}

// Destructor: frees the sequence buffer. delete[] on a NULL pointer is a no-op,
// so an object that never loaded a file is handled as well.
CchromosomeNTdata::~CchromosomeNTdata(void)
{
    delete [] caChromosome;
}
// Reset the object to its empty state: no buffer, zero length, sliding window
// at position 0 and the end-of-sequence flag cleared. Always returns 0.
// Note: the old buffer is NOT freed here; the pointer is simply overwritten.
int CchromosomeNTdata::initialization(void)
{
    this->end = false;
    this->SlideWindowStart = 0;
    this->iChromosome_size = 0;
    this->caChromosome = NULL;
    return 0;
}


// Load a FASTA file into caChromosome with one bulk read and then compact it in
// place with removedNonACGTNBaseAndCollectGeneName().
//
// Filename: path of the FASTA file; it is also copied into caInputFileName.
// Returns the final value of iChromosome_size (the number of bases kept).
//
// If the file cannot be opened or its size cannot be determined, a message is
// logged and the object is left holding an empty, NUL-terminated sequence
// (size 0) instead of allocating and scanning a buffer of bogus size.
int CchromosomeNTdata::Constructor_Fasta(const char* Filename)
{
    // NOTE(review): unbounded copy; the capacity of caInputFileName is not visible here.
    strcpy(caInputFileName, Filename);

    ifstream ifile(Filename);
    filebuf *pbuf = ifile.rdbuf();

    // The file size is only an upper bound for the sequence length: it also counts
    // header lines and line breaks (ratio about 41:40 on Windows).
    // Counting the bases exactly with getsizeofChromosome() is very slow, so it is not used.
    long fileSize = -1;
    if (ifile.is_open()) {
        fileSize = pbuf->pubseekoff(0, ios::end, ios::in);
    }
    if (fileSize < 0) {
        // pubseekoff reports failure with -1; never let that become a buffer size.
        LOG_INFO("Info %d: fail to open %s.\n", CONFIG_LOG, Filename);
        fileSize = 0;
    }

    this->iChromosome_size = fileSize;
    this->caChromosome = new char[this->iChromosome_size + 1];
    // Zero fill so the buffer is NUL-terminated no matter how many bytes get loaded.
    memset(this->caChromosome, 0x00, sizeof(char)*(this->iChromosome_size + 1));

    if (this->iChromosome_size > 0) {
        time_t loadStart, loadEnd; // named so they do not shadow the member 'end'
        time(&loadStart);
        pbuf->pubseekpos(0, ios::in); // Set to the beginning
        // Load the file directly into the buffer. Fewer characters than the file size
        // may arrive (e.g. text-mode newline translation), so keep the real count.
        const std::streamsize loaded = pbuf->sgetn(this->caChromosome, this->iChromosome_size);
        if (loaded >= 0) {
            this->iChromosome_size = loaded;
        }
        // Move characters within the array to drop header lines and bad characters.
        this->removedNonACGTNBaseAndCollectGeneName();
        time(&loadEnd);
        LOG_INFO("Info %d: %u seconds consumed.\n", CONFIG_LOG, (unsigned int)(loadEnd - loadStart));
    }
    ifile.close();

    return(this->iChromosome_size);
}

// Read a FASTA stream line by line and concatenate the sequence lines into
// caChromosome. Currently not used and still needs testing.
//
// ifile: stream positioned at the start of the file. Its first line is always
//        consumed and discarded (it is expected to be the '>' header line).
// Preconditions: caChromosome has room for iChromosome_size + 1 characters and
//        iChromosome_size holds that capacity on entry.
// On return caChromosome is NUL-terminated and iChromosome_size is the number of
// characters stored. Input beyond the capacity is dropped. Always returns 0.
int CchromosomeNTdata::readFastaFileLineByLine(ifstream &ifile)
{
    char caBuffer[MAX_CHAR_PER_LINE];
    unsigned int length_counter = 0;

    if (this->caChromosome == NULL) {
        // Nothing to write into.
        this->iChromosome_size = 0;
        return (0);
    }
    const unsigned int capacity = this->iChromosome_size;

    // Skip the first line. getline() sets failbit (without eofbit) when a line does
    // not fit the buffer; clear it and keep reading until the line really ends.
    bool lineContinues = false;
    do {
        caBuffer[0] = '\0';
        ifile.getline(caBuffer, MAX_CHAR_PER_LINE - 1);
        lineContinues = ifile.fail() && !ifile.eof() && !ifile.bad() && ifile.gcount() > 0;
        if (lineContinues)
            ifile.clear();
    } while (lineContinues);

    // Append every following line at the current write offset. Looping on good()
    // instead of eof() guarantees termination when the stream is in a failed state.
    while (ifile.good() && length_counter < capacity) {
        caBuffer[0] = '\0';
        ifile.getline(caBuffer, MAX_CHAR_PER_LINE - 1);

        unsigned int chunk = (unsigned int)strlen(caBuffer);
        if (chunk > capacity - length_counter)
            chunk = capacity - length_counter; // never write past the buffer
        memcpy(this->caChromosome + length_counter, caBuffer, chunk);
        length_counter += chunk;

        // Over-long line: the remainder is picked up by the next iteration.
        if (ifile.fail() && !ifile.eof() && !ifile.bad() && ifile.gcount() > 0)
            ifile.clear();
    }

    this->caChromosome[length_counter] = '\0';
    this->iChromosome_size = length_counter;
    return (0);
}

// Read a pre-processed sequence file (format by Sunje): a ch_header record
// followed by header.size bytes of sequence data.
//
// Filename: path of the file; it is also copied into caInputFileName.
// Returns iChromosome_size (taken from header.size).
// Terminates the process with exit(-1) when the file cannot be opened or when the
// header or the sequence data cannot be read completely.
int CchromosomeNTdata::Consrructor_PreSeq(const char* Filename)
{
    // The file starts with a raw struct, so open it in binary mode: text mode can
    // translate or truncate bytes on some platforms and break the size check below.
    FILE *fp = fopen(Filename, "rb");
    // NOTE(review): unbounded copy; the capacity of caInputFileName is not visible here.
    strcpy(caInputFileName, Filename);

    if (fp == NULL) {
        printf("fail to open %s contig file.\n", Filename);
        exit(-1);
    }

    ch_header header;
    if (fread((void*)&header, sizeof(ch_header), 1, fp) != 1) {
        // Without a complete header, header.size would be uninitialized.
        LOG_INFO("Info %d: fail to read %s contig file.\n", CONFIG_LOG, Filename);
        fclose(fp);
        exit(-1);
    }

    this->iChromosome_size = header.size;
    this->caChromosome = new char[this->iChromosome_size + 1];
    // Zero fill so the sequence is NUL-terminated after the read.
    memset(this->caChromosome, 0x00, sizeof(char)*(this->iChromosome_size + 1));

    const unsigned int bytesRead =
        (unsigned int)fread(this->caChromosome, 1, this->iChromosome_size, fp);
    if (bytesRead != this->iChromosome_size) {
        LOG_INFO("Info %d: fail to read %s contig file.\n", CONFIG_LOG, Filename);
        fclose(fp);
        exit(-1);
    }
    fclose(fp);
    return(this->iChromosome_size);
}

// Compute the sequence length of a FASTA file by reading it line by line.
//
// Filename: path of the FASTA file.
// Returns the summed length of every line after the first one; the first line is
// always skipped (it is expected to be the '>' header). Returns 0 when the file
// cannot be opened. Later header lines are counted like any other line.
int CchromosomeNTdata::getsizeofChromosome(const char* Filename)
{
    int Chromosome_size = 0; // local variable, returned to the caller
    char caBuffer[MAX_CHAR_PER_LINE];

    ifstream ifile;
    ifile.open(Filename);
    if (!ifile.is_open()) {
        // A stream that failed to open never reports eof(); bail out instead of looping.
        return(Chromosome_size);
    }

    // Skip the first line. getline() sets failbit (without eofbit) when a line does
    // not fit the buffer; clear it and keep reading until the line really ends.
    bool lineContinues = false;
    do {
        caBuffer[0] = '\0';
        ifile.getline(caBuffer, MAX_CHAR_PER_LINE - 1);
        lineContinues = ifile.fail() && !ifile.eof() && !ifile.bad() && ifile.gcount() > 0;
        if (lineContinues)
            ifile.clear();
    } while (lineContinues);

    // Looping on good() instead of eof() guarantees termination on any stream error.
    while (ifile.good()) {
        caBuffer[0] = '\0';
        ifile.getline(caBuffer, MAX_CHAR_PER_LINE - 1);
        Chromosome_size += (int)strlen(caBuffer);
        // Over-long line: the remainder is counted by the next iteration.
        if (ifile.fail() && !ifile.eof() && !ifile.bad() && ifile.gcount() > 0)
            ifile.clear();
    }

    ifile.close();
    return(Chromosome_size);
}

// Helper that copies one line out of a large NUL-terminated buffer.
//
// sourceBuf:      start of the line inside the large buffer.
// destinationBuf: receives the line without its terminator and is always
//                 NUL-terminated. The caller must provide room for the whole
//                 line plus one character; no capacity is passed in, so it
//                 cannot be checked here.
// Returns the number of characters copied, i.e. the offset in sourceBuf of the
// character that ended the line.
// The line ends at '\n', at a character equal to EOF, or at the NUL terminator of
// the source, so a last line without a trailing newline does not run past the buffer.
int sgetline(const char* sourceBuf, char* destinationBuf)
{
    int i = 0;
    while (sourceBuf[i] != '\0' && sourceBuf[i] != '\n' && sourceBuf[i] != EOF) {
        destinationBuf[i] = sourceBuf[i];
        i++;
    }
    destinationBuf[i] = '\0';
    return(i);
}

// Compact caChromosome in place, filtering out everything that is not a base,
// and record one gene entry per '>' header line.
//
// Characters are moved from read index i to write index j:
//   - A, C, G, T and N are kept;
//   - other nucleotide symbols (isNucleotide) are replaced by 'N';
//   - a '>' header line is removed. Its first word becomes the gene name, or
//     ":<index>" when the header has no word, after being passed through
//     filenameLize(). An 'N' is inserted between genes (not before the first
//     stored base) so that nothing maps across the junction;
//   - a character equal to EOF stops the scan;
//   - everything else (line breaks, blanks, ...) is dropped.
// On return caChromosome is NUL-terminated and iChromosome_size is the new length.
//
// NOTE(review): the return value is j - i in unsigned arithmetic, exactly as
// before. Since j <= i this wraps around; the number of removed characters would
// be i - j. Left unchanged because callers outside this file may rely on it.
unsigned int CchromosomeNTdata::removedNonACGTNBaseAndCollectGeneName(void)
{
    unsigned int i, j;
    for (i = 0, j = 0; i < this->iChromosome_size; i++) {
        if (this->caChromosome[i] == '>') {
            // Copy the header line into tagline. The scan never leaves the loaded
            // data, and the copy is truncated to the buffer size while the read
            // index still advances over the complete line.
            char tagline[MAX_LINE];
            const unsigned int tagCapacity = (unsigned int)sizeof(tagline) - 1;
            unsigned int lineLen = 0;
            while (i + lineLen < this->iChromosome_size
                    && this->caChromosome[i + lineLen] != '\n'
                    && this->caChromosome[i + lineLen] != EOF) {
                if (lineLen < tagCapacity) {
                    tagline[lineLen] = this->caChromosome[i + lineLen];
                }
                lineLen++;
            }
            tagline[lineLen < tagCapacity ? lineLen : tagCapacity] = '\0';
            i += lineLen; // now at the line terminator; the for-loop increment skips it

            // Gene name: skip '>' and take the first word.
            char caGeneName[MAX_LINE];
            caGeneName[0] = '\0';
            sscanf(&tagline[1], "%s", caGeneName);
            if (caGeneName[0] == '\0') {
                // No name in the header: fall back to ":<index of this gene>".
                // (The buffer must not be passed as both source and destination.)
                sprintf(caGeneName, ":%d", (int)this->geneVec.table.size());
            }

            filenameLize(caGeneName);
            if (j != 0) { // avoid adding N at the beginning
                this->caChromosome[j] = 'N';
                j++; // an 'N' separates the genes, to avoid mapping across the junction
            }
            this->geneVec.table.push_back(CGene(string(caGeneName), j)); // record the gene name and its start offset
        } else if (isACGT(this->caChromosome[i]) || this->caChromosome[i] == 'N') {
            this->caChromosome[j] = this->caChromosome[i];
            j++;
        } else if (isNucleotide(this->caChromosome[i])) {
            this->caChromosome[j] = 'N'; // replace special nucleotide symbol with 'N'
            j++;
        } else if (this->caChromosome[i] == EOF) {
            break;
        }
    }
    this->caChromosome[j] = '\0';
    this->iChromosome_size = j;
    return(j - i);
}

// Return the k-mer at the current sliding-window position and advance the window
// by one base.
//
// uiKmer_Length: number of bases to copy into caKmer.
// Returns caKmer. When the window would reach past the sequence (or its last
// position holds the NUL terminator), the end flag is set, caKmer is made empty
// and the window is not moved.
char* CchromosomeNTdata::fragKmer(unsigned int uiKmer_Length)
{
    // Index of the last base covered by the window.
    unsigned int lastPos = this->SlideWindowStart + uiKmer_Length - 1;

    const bool pastSequence = (lastPos >= this->iChromosome_size)
                              || (this->caChromosome[lastPos] == '\0');
    if (pastSequence) {
        this->end = true;
        this->caKmer[0] = '\0';
        return(this->caKmer);
    }

    strncpy(this->caKmer, &(this->caChromosome[this->SlideWindowStart]), uiKmer_Length);
    this->caKmer[uiKmer_Length] = '\0';
    this->SlideWindowStart++; // shift by exactly one base
    return(this->caKmer);
}

// Return the next k-mer that consists only of ACGT bases, starting the search at
// the current sliding-window position.
//
// uiKmer_Length: number of bases in the k-mer.
// Whenever a non-ACGT base is met, the window restarts right behind that base.
// Returns caKmer. On success the window is advanced by one base. When the end of
// the sequence is reached first, the end flag is set and caKmer is made empty.
//
// NOTE(review): the "Buf overflow" test only runs when a non-ACGT base is met, so
// it does not stop the writes to caKmer for an over-long uiKmer_Length. The
// capacity of caKmer is not visible here, so the behavior is kept as is.
char* CchromosomeNTdata::fragACGTKmer(unsigned int uiKmer_Length)
{
    unsigned int i = 0;
    while (i < uiKmer_Length) {
        char base = this->caChromosome[this->SlideWindowStart + i];

        if (isACGT(base)) {
            this->caKmer[i] = base;
            i++;
            continue;
        }

        if ((int)i > _MAX_KMER_LENGTH_ ) {
            cout << "Buf overflow " << endl;
            break;
        }

        // A non-ACGT base: either the end of the chromosome or a base to skip.
        const bool reachedEnd = (base == '\0')
                                || (this->SlideWindowStart + i >= this->iChromosome_size);
        if (reachedEnd) {
            this->caKmer[0] = '\0';
            this->end = true;
            return(this->caKmer);
        }

        this->SlideWindowStart += (i + 1); // skip past the non-ACGT base
        i = 0;                             // and start the k-mer over
    }
    this->caKmer[uiKmer_Length] = '\0';
    this->SlideWindowStart++;
    return(this->caKmer);
}


